1. Data Understanding & Preparation¶

1.1 Sources & Scope¶

  • 4 HDB resale CSVs (2000–2024), ~628k rows, 26 towns, 7 flat types.
In [7]:
from pathlib import Path
import pandas as pd

data_dir = Path(".")   # CSVs live alongside the notebook

files = [
    "resale-flat-prices-approval-2000-2005.csv",
    "resale-flat-prices-approval-2006-2012.csv",
    "resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv",
    "resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv",
    "ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv",
]

# (Keep the rest of your cleaning + concat loop the same, using pd.read_csv(data_dir / f))
In [6]:
data_dir = Path(".")
files = sorted(p.name for p in data_dir.glob("*.csv"))
print(files)  # sanity check

df_list = []
expected_cols = [
    "month","town","flat_type","block","street_name","storey_range",
    "floor_area_sqm","flat_model","lease_commence_date","resale_price","remaining_lease",
]
for f in files:
    pdf = pd.read_csv(data_dir / f, encoding="utf-8", low_memory=False)
    pdf.columns = pdf.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)
    for col in expected_cols:
        if col not in pdf.columns:
            pdf[col] = pd.NA
    pdf["month"] = pd.to_datetime(pdf["month"], errors="coerce")
    pdf["resale_price"] = pd.to_numeric(pdf["resale_price"], errors="coerce")
    pdf["floor_area_sqm"] = pd.to_numeric(pdf["floor_area_sqm"], errors="coerce")
    pdf["lease_commence_date"] = pd.to_numeric(pdf["lease_commence_date"], errors="coerce")
    pdf["source_file"] = f
    df_list.append(pdf)

df = pd.concat(df_list, ignore_index=True)
print("✅ Combined data shape:", df.shape)
df.head()
['ResaleflatpricesbasedonregistrationdatefromJan2017onwards.csv', 'resale-flat-prices-approval-2000-2005.csv', 'resale-flat-prices-approval-2006-2012.csv', 'resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv', 'resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv']
✅ Combined data shape: (628591, 12)
Out[6]:
month town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date remaining_lease resale_price source_file
0 2017-01-01 ANG MO KIO 2 ROOM 406 ANG MO KIO AVE 10 10 TO 12 44.0 Improved 1979 61 years 04 months 232000.0 ResaleflatpricesbasedonregistrationdatefromJan...
1 2017-01-01 ANG MO KIO 3 ROOM 108 ANG MO KIO AVE 4 01 TO 03 67.0 New Generation 1978 60 years 07 months 250000.0 ResaleflatpricesbasedonregistrationdatefromJan...
2 2017-01-01 ANG MO KIO 3 ROOM 602 ANG MO KIO AVE 5 01 TO 03 67.0 New Generation 1980 62 years 05 months 262000.0 ResaleflatpricesbasedonregistrationdatefromJan...
3 2017-01-01 ANG MO KIO 3 ROOM 465 ANG MO KIO AVE 10 04 TO 06 68.0 New Generation 1980 62 years 01 month 265000.0 ResaleflatpricesbasedonregistrationdatefromJan...
4 2017-01-01 ANG MO KIO 3 ROOM 601 ANG MO KIO AVE 5 01 TO 03 67.0 New Generation 1980 62 years 05 months 265000.0 ResaleflatpricesbasedonregistrationdatefromJan...
In [8]:
# Basic health checks
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 628591 entries, 0 to 628590
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   month                628591 non-null  datetime64[ns]
 1   town                 628591 non-null  object        
 2   flat_type            628591 non-null  object        
 3   block                628591 non-null  object        
 4   street_name          628591 non-null  object        
 5   storey_range         628591 non-null  object        
 6   floor_area_sqm       628591 non-null  float64       
 7   flat_model           628591 non-null  object        
 8   lease_commence_date  628591 non-null  int64         
 9   remaining_lease      206737 non-null  object        
 10  resale_price         628591 non-null  float64       
 11  source_file          628591 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(8)
memory usage: 57.5+ MB

1.2 Cleaning & Standardisation¶

  • Standardised columns; parsed month, coerced numerics.
  • Engineered flat_age_years, remaining_lease_years.
In [3]:
import numpy as np

# 1️⃣ Convert "month" column to datetime format
df["month"] = pd.to_datetime(df["month"], errors="coerce")

# 2️⃣ Convert numeric columns
for c in ["floor_area_sqm", "lease_commence_date", "resale_price"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# 3️⃣ Create a real datetime for lease_commence_date (1 Jan of that year)
df["lease_start_date"] = pd.to_datetime(df["lease_commence_date"], format="%Y", errors="coerce")

# 4️⃣ Calculate flat age (years between sale and lease start)
df["flat_age_years"] = np.where(
    df["lease_start_date"].notna() & df["month"].notna(),
    df["month"].dt.year - df["lease_start_date"].dt.year,
    np.nan
)

# 5️⃣ Compute remaining lease = 99 – age and clip between 0 and 99
df["remaining_lease_years"] = (99 - df["flat_age_years"]).clip(lower=0, upper=99)

# 6️⃣ Preview cleaned columns
df_clean = df[[
    "month","town","flat_type","storey_range","floor_area_sqm",
    "flat_model","lease_commence_date","flat_age_years","remaining_lease_years","resale_price"
]]
print("✅ After cleaning:", df_clean.shape)
df_clean.head(10)
✅ After cleaning: (628591, 10)
Out[3]:
month town flat_type storey_range floor_area_sqm flat_model lease_commence_date flat_age_years remaining_lease_years resale_price
0 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 69.0 Improved 1986 14.0 85.0 147000.0
1 2000-01-01 ANG MO KIO 3 ROOM 04 TO 06 61.0 Improved 1986 14.0 85.0 144000.0
2 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 73.0 New Generation 1976 24.0 75.0 159000.0
3 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 73.0 New Generation 1976 24.0 75.0 167000.0
4 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 67.0 New Generation 1976 24.0 75.0 163000.0
5 2000-01-01 ANG MO KIO 3 ROOM 04 TO 06 73.0 New Generation 1977 23.0 76.0 157000.0
6 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 73.0 New Generation 1977 23.0 76.0 178000.0
7 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 68.0 New Generation 1981 19.0 80.0 160000.0
8 2000-01-01 ANG MO KIO 3 ROOM 04 TO 06 68.0 New Generation 1981 19.0 80.0 169000.0
9 2000-01-01 ANG MO KIO 3 ROOM 07 TO 09 82.0 New Generation 1981 19.0 80.0 205000.0

1.3 Dataset Summary (Key Variables)¶

In [4]:
# Convert all text columns to uppercase and remove extra spaces
text_columns = ["town", "flat_type", "flat_model", "storey_range", "street_name"]

for col in text_columns:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)          # ensure text
            .str.upper()          # make consistent casing
            .str.strip()          # remove spaces
        )

# Check unique values after standardization
print("✅ Towns:", df["town"].nunique())
print("✅ Flat types:", df["flat_type"].unique()[:10])
print("✅ Flat models:", df["flat_model"].unique()[:10])
✅ Towns: 26
✅ Flat types: ['3 ROOM' '4 ROOM' '5 ROOM' 'EXECUTIVE' '2 ROOM' 'MULTI-GENERATION'
 '1 ROOM']
✅ Flat models: ['IMPROVED' 'NEW GENERATION' 'MODEL A' 'STANDARD' 'APARTMENT' 'SIMPLIFIED'
 'MODEL A-MAISONETTE' 'MAISONETTE' 'MULTI GENERATION' 'ADJOINED FLAT']
In [5]:
# Check how many missing values remain in each column
df.isna().sum().sort_values(ascending=False)
Out[5]:
remaining_lease          421854
town                          0
month                         0
block                         0
street_name                   0
storey_range                  0
flat_type                     0
floor_area_sqm                0
flat_model                    0
lease_commence_date           0
resale_price                  0
source_file                   0
lease_start_date              0
flat_age_years                0
remaining_lease_years         0
dtype: int64
In [6]:
# Drop rows with missing resale price (target variable)
df = df.dropna(subset=["resale_price"])

# Fill missing flat_age or remaining_lease_years with median
df["flat_age_years"] = df["flat_age_years"].fillna(df["flat_age_years"].median())
df["remaining_lease_years"] = df["remaining_lease_years"].fillna(df["remaining_lease_years"].median())

# Quick check again
print("✅ Missing values after cleaning:")
print(df.isna().sum())
✅ Missing values after cleaning:
month                         0
town                          0
flat_type                     0
block                         0
street_name                   0
storey_range                  0
floor_area_sqm                0
flat_model                    0
lease_commence_date           0
resale_price                  0
remaining_lease          421854
source_file                   0
lease_start_date              0
flat_age_years                0
remaining_lease_years         0
dtype: int64
In [11]:
# Date + numeric sanity
df['month'] = pd.to_datetime(df['month'], errors='coerce')
df[['resale_price','floor_area_sqm']].describe()
Out[11]:
resale_price floor_area_sqm
count 6.285910e+05 628591.000000
mean 3.620510e+05 96.767132
std 1.643300e+05 25.051891
min 2.800000e+04 28.000000
25% 2.400000e+05 74.000000
50% 3.380000e+05 96.000000
75% 4.500000e+05 113.000000
max 1.500000e+06 297.000000
In [12]:
# Categorical distribution + time coverage
df['town'].value_counts().head(10), df['flat_type'].value_counts().head(10)
Out[12]:
(town
 WOODLANDS        55028
 TAMPINES         48480
 JURONG WEST      46656
 YISHUN           41503
 BEDOK            38939
 HOUGANG          34173
 SENGKANG         31532
 ANG MO KIO       31192
 CHOA CHU KANG    29136
 BUKIT BATOK      27333
 Name: count, dtype: int64,
 flat_type
 4 ROOM              248960
 3 ROOM              180081
 5 ROOM              144616
 EXECUTIVE            46835
 2 ROOM                7366
 1 ROOM                 473
 MULTI-GENERATION       260
 Name: count, dtype: int64)
In [13]:
# Time span
df['month'].min(), df['month'].max(), len(df)
Out[13]:
(Timestamp('2000-01-01 00:00:00'), Timestamp('2024-01-01 00:00:00'), 628591)

1.3.1 Count by Flat Type¶

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8,5))
ax = sns.countplot(
    x="flat_type",
    hue="flat_type",                 # ✅ fix for seaborn 0.14+ (palette requires hue)
    data=df,
    order=df["flat_type"].value_counts().index,
    palette="muted",
    dodge=False,
    legend=False
)
ax.set_title("Distribution of Records by Flat Type")
ax.set_xlabel("Flat Type")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=0)

# ✅ annotate counts on bars
for p in ax.patches:
    h = p.get_height()
    ax.annotate(
        f"{h:,}",
        (p.get_x() + p.get_width()/2, h),
        ha="center", va="bottom", fontsize=9,
        xytext=(0, 3), textcoords="offset points"
    )

plt.tight_layout()
plt.show()
No description has been provided for this image

1.3.2 Resale Price Distribution¶

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# helper: format axes with thousands separators
fmt = FuncFormatter(lambda x, _: f"{int(x):,}")

plt.figure(figsize=(8,4))
sns.histplot(
    df["resale_price"],
    bins=50,
    kde=True,
    color=sns.color_palette("muted")[0]  # soft color, no hue needed
)
plt.title("Distribution of Resale Prices")
plt.xlabel("Resale Price (SGD)")
plt.ylabel("Count")
plt.gca().xaxis.set_major_formatter(fmt)
plt.tight_layout()
plt.show()
No description has been provided for this image

1.3.3 Records by Town (Top 10)¶

In [9]:
import seaborn as sns
import matplotlib.pyplot as plt

# Prepare top-10 towns by transaction count
top10_towns = df["town"].value_counts().head(10).index
df_top10 = df[df["town"].isin(top10_towns)]
order = df_top10["town"].value_counts().index  # sorted order

plt.figure(figsize=(9,5))
ax = sns.countplot(
    x="town",
    hue="town",               # ✅ fixes Seaborn 0.14 palette warning
    data=df_top10,
    order=order,
    palette="muted",
    dodge=False,
    legend=False
)
ax.set_title("Top 10 Towns by Number of Transactions")
ax.set_xlabel("Town")
ax.set_ylabel("Number of Transactions")
plt.xticks(rotation=15)

# annotate counts on bars
for p in ax.patches:
    h = p.get_height()
    ax.annotate(
        f"{h:,}",
        (p.get_x() + p.get_width()/2, h),
        ha="center", va="bottom", fontsize=9,
        xytext=(0, 3), textcoords="offset points"
    )

plt.tight_layout()
plt.show()
No description has been provided for this image

1.3.4 Line chart — price trend over time for selected towns¶

In [10]:
import pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

towns = ["ANG MO KIO","TAMPINES","JURONG WEST","WOODLANDS"]  # edit as you like
df_line = df.copy()
df_line["year_month"] = pd.to_datetime(df_line["month"]).dt.to_period("M").dt.to_timestamp()
df_line = df_line[df_line["town"].isin(towns)]
trend = df_line.groupby(["year_month","town"], as_index=False)["resale_price"].mean()

plt.figure(figsize=(9,4))
sns.lineplot(data=trend, x="year_month", y="resale_price", hue="town", marker="o", palette="muted")
plt.title("Average Resale Price by Month — Selected Towns")
plt.xlabel("Month"); plt.ylabel("Avg Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x,_: f"{int(x):,}"))
plt.tight_layout(); plt.show()
No description has been provided for this image

1.3.5 Bar chart — average prices by town or flat type¶

By town (top 10):

In [11]:
import seaborn as sns, matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Top 10 towns by average price
top10 = (df.groupby("town")["resale_price"]
           .mean().sort_values(ascending=False)
           .head(10).reset_index())

plt.figure(figsize=(9,5))
ax = sns.barplot(
    data=top10, y="town", x="resale_price",
    hue="town", dodge=False, legend=False, palette="muted"
)
plt.title("Average Resale Price — Top 10 Towns")
plt.xlabel("Avg Price (SGD)"); plt.ylabel("")
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))

# --- add data labels ---
for p in ax.patches:
    w = p.get_width()
    y = p.get_y() + p.get_height()/2
    ax.text(w + 2000, y, f"{int(w):,}", va="center", ha="left", fontsize=10)

plt.tight_layout(); plt.show()
No description has been provided for this image
In [12]:
by_type = (df.groupby("flat_type")["resale_price"]
             .mean().sort_values().reset_index())

plt.figure(figsize=(9,4))
ax = sns.barplot(
    data=by_type, x="flat_type", y="resale_price",
    hue="flat_type", dodge=False, legend=False, palette="pastel"
)
plt.title("Average Resale Price by Flat Type")
plt.xlabel(""); plt.ylabel("Avg Price (SGD)")
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))

# --- add data labels ---
for p in ax.patches:
    h = p.get_height()
    x = p.get_x() + p.get_width()/2
    ax.text(x, h + 2500, f"{int(h):,}", ha="center", va="bottom", fontsize=10)

plt.tight_layout(); plt.show()
No description has been provided for this image

1.3.6 Map chart — geographic price distribution across Singapore¶

In [18]:
import plotly.express as px

town_coords = {
 "ANG MO KIO":(1.3691,103.8454),"BEDOK":(1.3236,103.9305),"BISHAN":(1.3508,103.8485),
 "BUKIT BATOK":(1.3502,103.7490),"BUKIT MERAH":(1.2857,103.8120),"BUKIT PANJANG":(1.3786,103.7616),
 "BUKIT TIMAH":(1.3294,103.8021),"CENTRAL AREA":(1.2903,103.8519),"CHOA CHU KANG":(1.3854,103.7449),
 "CLEMENTI":(1.3151,103.7643),"GEYLANG":(1.3180,103.8830),"HOUGANG":(1.3626,103.8922),
 "JURONG EAST":(1.3331,103.7430),"JURONG WEST":(1.3403,103.7074),"KALLANG/WHAMPOA":(1.3136,103.8640),
 "MARINE PARADE":(1.3030,103.9065),"PASIR RIS":(1.3731,103.9497),"PUNGGOL":(1.4053,103.9020),
 "QUEENSTOWN":(1.2941,103.7866),"SEMBAWANG":(1.4491,103.8201),"SENGKANG":(1.3933,103.8959),
 "SERANGOON":(1.3536,103.8720),"TAMPINES":(1.3531,103.9457),"TOA PAYOH":(1.3346,103.8530),
 "WOODLANDS":(1.4360,103.7865),"YISHUN":(1.4304,103.8353)
}

df_map = (df.groupby("town", as_index=False)["resale_price"].mean()
            .assign(lat=lambda d: d["town"].map(lambda t: town_coords.get(t,(None,None))[0]),
                    lon=lambda d: d["town"].map(lambda t: town_coords.get(t,(None,None))[1]))
            .dropna(subset=["lat","lon"]))

fig = px.scatter_mapbox(df_map, lat="lat", lon="lon", size="resale_price",
                        color="resale_price", color_continuous_scale="Viridis",
                        hover_name="town", hover_data={"resale_price":":,.0f"},
                        size_max=30, zoom=10, height=500)
fig.update_layout(mapbox_style="open-street-map", title="Avg Resale Price by Town (Map)")
fig.show()

1.3.7 Scatter — floor area vs price, colored by town¶

In [13]:
sample = df.sample(6000, random_state=42) if len(df)>6000 else df
plt.figure(figsize=(9,4))
sns.scatterplot(data=sample, x="floor_area_sqm", y="resale_price",
                hue="town", alpha=0.35, s=30, palette="tab20")
plt.title("Floor Area vs Resale Price (colored by Town)")
plt.xlabel("Floor Area (sqm)"); plt.ylabel("Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x,_: f"{int(x):,}"))
plt.legend(ncol=2, fontsize=8, bbox_to_anchor=(1.02,1), loc="upper left")
plt.tight_layout(); plt.show()
No description has been provided for this image

1.3.8 Heatmap — price per sqm by town × flat type¶

In [14]:
df_ppsqm = df.assign(price_per_sqm = df["resale_price"] / df["floor_area_sqm"])
pivot = (df_ppsqm.groupby(["town","flat_type"])["price_per_sqm"]
         .mean().unstack("flat_type"))

plt.figure(figsize=(12,8))
sns.heatmap(pivot, cmap="YlGnBu", annot=False)
plt.title("Average Price per sqm by Town and Flat Type")
plt.xlabel("Flat Type"); plt.ylabel("Town")
plt.tight_layout(); plt.show()
No description has been provided for this image
In [13]:
import pandas as pd

# Pick the key variables for Slide 7
summary_cols = ["floor_area_sqm", "flat_age_years", "remaining_lease_years", "resale_price"]

# Safety: keep only columns that exist (in case of naming differences)
available = [c for c in summary_cols if c in df.columns]

# Summary stats
summary = df[available].describe().T  # transpose so variables are rows
summary = summary.rename(columns={
    "50%": "median"
})[["count", "mean", "std", "min", "median", "max"]].round(2)

# Missing values per column
missing = df[available].isna().sum().rename("missing")

# Merge for one clean table
summary_table = summary.merge(missing, left_index=True, right_index=True)

summary_table
Out[13]:
count mean std min median max missing
floor_area_sqm 628591.0 96.77 25.05 28.0 96.0 297.0 0
flat_age_years 628591.0 20.12 11.42 -2.0 20.0 57.0 0
remaining_lease_years 628591.0 78.88 11.42 42.0 79.0 99.0 0
resale_price 628591.0 362050.97 164329.97 28000.0 338000.0 1500000.0 0

2. Exploratory Data Analysis (EDA)¶

2.1 Price Trend Over Time (Yearly Average)¶

In [15]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

# Prepare yearly average price
df_year = df.copy()
df_year["year"] = pd.to_datetime(df_year["month"]).dt.year
yearly = df_year.groupby("year", as_index=False)["resale_price"].mean()

# Plot
plt.figure(figsize=(8,5))
sns.lineplot(data=yearly, x="year", y="resale_price", marker="o", color=sns.color_palette("muted")[0])
plt.title("Average Resale Price by Year")
plt.xlabel("Year")
plt.ylabel("Average Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.tight_layout()
plt.show()
No description has been provided for this image

2.2 Floor Area vs Resale Price¶

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

plt.figure(figsize=(8,4))
sns.scatterplot(
    data=df.sample(5000, random_state=42) if len(df) > 5000 else df,  # speed + clarity
    x="floor_area_sqm",
    y="resale_price",
    alpha=0.3,
    hue=None,
    color=sns.color_palette("muted")[1]
)
plt.title("Floor Area vs Resale Price")
plt.xlabel("Floor Area (sqm)")
plt.ylabel("Resale Price (SGD)")
plt.gca().yaxis.set_major_formatter(FuncFormatter(lambda x, _: f"{int(x):,}"))
plt.tight_layout()
plt.show()
No description has been provided for this image

2.3 Correlation Heatmap¶

In [17]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

num_cols = ["floor_area_sqm", "flat_age_years", "remaining_lease_years", "resale_price"]
num_cols = [c for c in num_cols if c in df.columns]

corr = df[num_cols].corr()

plt.figure(figsize=(6,4))
sns.heatmap(
    corr, annot=True, fmt=".2f",
    cmap="crest",    # light, professional
    vmin=-1, vmax=1, linewidths=.5
)
plt.title("Correlation — Key Numerical Features")
plt.tight_layout()
plt.show()
No description has been provided for this image

3. Model Development & Evaluation¶

  • Linear Regression
  • Decision Tree Regressor
  • Random Forest Regressor

3.1 Linear Regression Model¶

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Drop irrelevant or redundant columns ---
df_model = df.drop(columns=[
    "source_file", 
    "block", 
    "street_name", 
    "remaining_lease"  # text version, replaced by numeric one
])

# --- Encode categorical columns ---
label_cols = ["town", "flat_type", "flat_model", "storey_range"]

le = LabelEncoder()
for col in label_cols:
    df_model[col] = le.fit_transform(df_model[col])

# --- Define X (features) and y (target) ---
X = df_model.drop(columns=["resale_price"])
y = df_model["resale_price"]

# --- Split data into train and test sets (80% train, 20% test) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("✅ Data prepared successfully!")
print(f"Training set: {X_train.shape}")
print(f"Testing set:  {X_test.shape}")
✅ Data prepared successfully!
Training set: (502872, 10)
Testing set:  (125719, 10)
In [19]:
# Convert datetime columns to numeric (years)
X_train = X_train.copy()
X_test = X_test.copy()

# Extract just the year from date columns
for col in ["month", "lease_commence_date"]:
    if col in X_train.columns:
        X_train[col] = pd.to_datetime(X_train[col]).dt.year
        X_test[col] = pd.to_datetime(X_test[col]).dt.year

# Now fill any missing values caused by conversion
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

print("✅ Date columns converted to numeric successfully!")
✅ Date columns converted to numeric successfully!
In [20]:
# Ensure all datetime columns are converted to year integers
for col in X_train.columns:
    if np.issubdtype(X_train[col].dtype, np.datetime64):
        X_train[col] = pd.to_datetime(X_train[col]).dt.year.fillna(0).astype(int)
        X_test[col] = pd.to_datetime(X_test[col]).dt.year.fillna(0).astype(int)

# Double-check no datetime columns remain
print("Remaining datetime columns:", X_train.select_dtypes(include=['datetime64']).columns.tolist())
Remaining datetime columns: []
In [21]:
# core imports for modeling
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

import numpy as np
import pandas as pd

print("Imports OK")
Imports OK
In [22]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
Out[22]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [23]:
# Check which columns still have datetime types
X_train.select_dtypes(include=['datetime64']).columns
Out[23]:
Index([], dtype='object')
In [24]:
# Check which columns still have datetime types
X_train.select_dtypes(include=['datetime64']).columns
Out[24]:
Index([], dtype='object')
In [25]:
X_train.select_dtypes(include=['datetime64']).columns
Out[25]:
Index([], dtype='object')
In [26]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
Out[26]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [27]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Make predictions
y_pred = lin_reg.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")
print(f"R² Score: {r2:.4f}")

# Scatter plot to visualize actual vs predicted
plt.figure(figsize=(4,4))
plt.scatter(y_test, y_pred, alpha=0.3, color='royalblue')
plt.xlabel("Actual Resale Price")
plt.ylabel("Predicted Resale Price")
plt.title("Actual vs Predicted Prices — Linear Regression")
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.show()
Root Mean Squared Error (RMSE): 83,858.53
R² Score: 0.7375
No description has been provided for this image

3.2 Decision Tree Regression Model¶

In [28]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt

# --- Train the Decision Tree Model ---
tree_reg = DecisionTreeRegressor(max_depth=10, random_state=42)
tree_reg.fit(X_train, y_train)

# --- Make predictions ---
y_pred_tree = tree_reg.predict(X_test)

# --- Evaluate performance ---
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = mse_tree ** 0.5
r2_tree = r2_score(y_test, y_pred_tree)

print(f"Decision Tree RMSE: {rmse_tree:,.2f}")
print(f"Decision Tree R²: {r2_tree:.4f}")

# --- Feature Importance with Color Palette (fixed version) ---
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": tree_reg.feature_importances_
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(8,4))
sns.barplot(
    x="Importance",
    y="Feature",
    hue="Feature",
    data=feature_importance,
    palette="coolwarm",
    legend=False
)
plt.title("Feature Importance — Decision Tree (Coolwarm Palette)")
plt.xlabel("Relative Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
Decision Tree RMSE: 56,853.22
Decision Tree R²: 0.8793
No description has been provided for this image

3.3. Random Forest Regression Model¶

In [37]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Train the Random Forest Model ---
rf_reg = RandomForestRegressor(
    n_estimators=100,       # number of trees
    max_depth=15,           # limits tree depth to prevent overfitting
    random_state=42,
    n_jobs=-1               # use all CPU cores
)
rf_reg.fit(X_train, y_train)

# --- Make predictions ---
y_pred_rf = rf_reg.predict(X_test)

# --- Evaluate performance ---
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"🌲 Random Forest RMSE: {rmse_rf:,.2f}")
print(f"🌲 Random Forest R²: {r2_rf:.4f}")

# --- Feature Importance Visualization ---
feature_importance_rf = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf_reg.feature_importances_
}).sort_values(by="Importance", ascending=False)

plt.figure(figsize=(8,4))
sns.barplot(
    x="Importance",
    y="Feature",
    hue="Feature",
    data=feature_importance_rf,
    dodge=False,
    palette=sns.color_palette("muted", n_colors=len(feature_importance_rf)),
    legend=False
)
plt.title("Feature Importance — Random Forest (Muted Palette)")
plt.xlabel("Relative Importance")
plt.ylabel("Features")
plt.tight_layout()
plt.show()
🌲 Random Forest RMSE: 38,860.97
🌲 Random Forest R²: 0.9436
No description has been provided for this image

3.4 Model Comparison¶

In [34]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# --- Gather all model metrics ---
results = pd.DataFrame({
    "Model": ["Linear Regression", "Decision Tree", "Random Forest"],
    "R² Score": [r2, r2_tree, r2_rf],
    "RMSE": [rmse, rmse_tree, rmse_rf]
})

# --- Plot R² comparison (muted palette) ---
plt.figure(figsize=(6,4))
sns.barplot(
    x="Model",
    y="R² Score",
    hue="Model",              # ✅ fixes warning
    data=results,
    palette="muted",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — R² Score")
plt.ylim(0,1)
plt.tight_layout()
plt.show()

# --- Plot RMSE comparison (pastel palette) ---
plt.figure(figsize=(6,4))
sns.barplot(
    x="Model",
    y="RMSE",
    hue="Model",              # ✅ fixes warning
    data=results,
    palette="pastel",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)")
plt.tight_layout()
plt.show()

# --- Display summary table ---
display(results.style.background_gradient(cmap="YlGnBu"))
No description has been provided for this image
No description has been provided for this image
  Model R² Score RMSE
0 Linear Regression 0.737510 83858.526719
1 Decision Tree 0.879350 56853.222920
2 Random Forest 0.943630 38860.968179
In [35]:
# --- Annotated R² Comparison Chart (muted palette) ---
plt.figure(figsize=(5,5))
ax = sns.barplot(
    x="Model",
    y="R² Score",
    hue="Model",
    data=results,
    palette="muted",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — R² Score")
plt.ylim(0, 1)

# Add text labels on each bar
for p in ax.patches:
    ax.text(
        p.get_x() + p.get_width() / 2,
        p.get_height() + 0.02,
        f"{p.get_height():.3f}",
        ha="center", va="bottom", fontsize=10
    )

plt.tight_layout()
plt.show()

# --- Annotated RMSE Comparison Chart (pastel palette) ---
plt.figure(figsize=(5,6))
ax = sns.barplot(
    x="Model",
    y="RMSE",
    hue="Model",
    data=results,
    palette="pastel",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)")

# Add text labels on each bar
for p in ax.patches:
    ax.text(
        p.get_x() + p.get_width() / 2,
        p.get_height() + (0.02 * max(results["RMSE"])),  # small offset
        f"{p.get_height():,.0f}",
        ha="center", va="bottom", fontsize=10
    )

plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [31]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Example data (replace with your actual results DataFrame)
results = pd.DataFrame({
    "Model": ["Linear Regression", "Decision Tree", "Random Forest"],
    "R² Score": [0.737510, 0.879350, 0.943630],
    "RMSE": [83858.53, 56853.22, 38860.97]
})

# --- R² Comparison Chart (Muted Palette) ---
plt.figure(figsize=(6,4))
ax = sns.barplot(
    x="Model",
    y="R² Score",
    hue="Model",
    data=results,
    palette="muted",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — R² Score", fontsize=13, weight="bold")
plt.ylim(0, 1.05)
plt.ylabel("R² Score")

# Add annotations for exact R² values
for p in ax.patches:
    value = p.get_height()
    ax.text(
        p.get_x() + p.get_width()/2,
        value + 0.02,                # small offset above bar
        f"{value:.3f}",
        ha="center", va="bottom",
        fontsize=10, color="black", weight="bold"
    )

plt.tight_layout()
plt.show()

# --- RMSE Comparison Chart (Pastel Palette) ---
plt.figure(figsize=(6,4))
ax = sns.barplot(
    x="Model",
    y="RMSE",
    hue="Model",
    data=results,
    palette="pastel",
    dodge=False,
    legend=False
)
plt.title("Model Comparison — RMSE (Lower is Better)", fontsize=13, weight="bold")
plt.ylabel("RMSE")

# Add annotations for exact RMSE values
for p in ax.patches:
    value = p.get_height()
    ax.text(
        p.get_x() + p.get_width()/2,
        value + 1500,                # offset based on chart scale
        f"{value:,.0f}",             # comma format, 0 decimals
        ha="center", va="bottom",
        fontsize=10, color="black", weight="bold"
    )

plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# --- Extract feature importances from both models ---
tree_importances = pd.Series(tree_reg.feature_importances_, index=X.columns)
rf_importances   = pd.Series(rf_reg.feature_importances_,   index=X.columns)

# --- Sort top 10 features ---
top_n = 10
tree_top = tree_importances.sort_values(ascending=False).head(top_n)
rf_top   = rf_importances.sort_values(ascending=False).head(top_n)

# --- Create side-by-side plots ---
fig, axes = plt.subplots(1, 2, figsize=(15,5))

# Decision Tree
sns.barplot(
    x=tree_top.values, y=tree_top.index,
    ax=axes[0],
    palette="muted",
    hue=tree_top.index, legend=False, dodge=False
)
axes[0].set_title("Decision Tree — Top Feature Importances", fontsize=13, weight="bold")
axes[0].set_xlabel("Importance")
axes[0].set_ylabel("Feature")

# Annotate importance values
for i, v in enumerate(tree_top.values):
    axes[0].text(v + 0.002, i, f"{v:.3f}", va="center", fontsize=9)

# Random Forest
sns.barplot(
    x=rf_top.values, y=rf_top.index,
    ax=axes[1],
    palette="pastel",
    hue=rf_top.index, legend=False, dodge=False
)
axes[1].set_title("Random Forest — Top Feature Importances", fontsize=13, weight="bold")
axes[1].set_xlabel("Importance")
axes[1].set_ylabel("")

for i, v in enumerate(rf_top.values):
    axes[1].text(v + 0.002, i, f"{v:.3f}", va="center", fontsize=9)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [32]:
import pandas as pd

feature_importance_rf = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": rf_reg.feature_importances_
}).sort_values("Importance", ascending=False).head(10)

plt.figure(figsize=(8,4))
sns.barplot(
    x="Importance", y="Feature",
    hue="Feature", data=feature_importance_rf,
    palette="pastel", dodge=False, legend=False
)
plt.title("Random Forest — Top Feature Importances")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: